This example walks through creating a regression model using the Boston data frame from the MASS package.
library(MASS)
names(Boston)
[1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
[8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
str(Boston)
'data.frame': 506 obs. of 14 variables:
$ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
$ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
$ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
$ chas : int 0 0 0 0 0 0 0 0 0 0 ...
$ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
$ rm : num 6.58 6.42 7.18 7 7.15 ...
$ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
$ dis : num 4.09 4.97 4.97 6.06 6.06 ...
$ rad : int 1 2 2 3 3 3 5 5 5 5 ...
$ tax : num 296 242 242 222 222 222 311 311 311 311 ...
$ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
$ black : num 397 397 393 395 397 ...
$ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
$ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
We would like to create a model that predicts the median value of owner occupied homes (medv). Start by viewing the relationship between medv and the other variables in the Boston data frame. We will use the package car. If you do not have car installed on your machine, install it now.
library(car)
scatterplotMatrix(~ medv + lstat + black + ptratio + tax + rad + dis + age + rm + nox + chas + indus + zn + crim, data = Boston)
Comment about scatter plot
simple.lm <- lm(medv ~ lstat, data = Boston)
simple.lm
Call:
lm(formula = medv ~ lstat, data = Boston)
Coefficients:
(Intercept) lstat
34.55 -0.95
summary(simple.lm)
Call:
lm(formula = medv ~ lstat, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-15.168 -3.990 -1.318 2.034 24.500
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 34.55384 0.56263 61.41 <2e-16 ***
lstat -0.95005 0.03873 -24.53 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.216 on 504 degrees of freedom
Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
anova(simple.lm)
Analysis of Variance Table
Response: medv
Df Sum Sq Mean Sq F value Pr(>F)
lstat 1 23244 23243.9 601.62 < 2.2e-16 ***
Residuals 504 19472 38.6
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(2, 2))
plot(simple.lm)
par(mfrow=c(1, 1))
library(ggplot2)
ggplot(data = Boston, aes(x = lstat, y = medv)) +
geom_point() +
theme_bw() +
stat_smooth(method = "lm")
twoiv.lm <- lm(medv ~ lstat + rm, data = Boston)
summary(twoiv.lm)
Call:
lm(formula = medv ~ lstat + rm, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-18.076 -3.516 -1.010 1.909 28.131
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.35827 3.17283 -0.428 0.669
lstat -0.64236 0.04373 -14.689 <2e-16 ***
rm 5.09479 0.44447 11.463 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.54 on 503 degrees of freedom
Multiple R-squared: 0.6386, Adjusted R-squared: 0.6371
F-statistic: 444.3 on 2 and 503 DF, p-value: < 2.2e-16
par(mfrow=c(2, 2))
plot(twoiv.lm)
par(mfrow=c(1, 1))
The functions add1 and update can be used to create a model with forward selection.
SCOPE <- (~ . + crim + zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat)
mod.fs <- lm(medv ~ 1, data = Boston)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ 1
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 42716 2246.5
crim 1 6440.8 36276 2165.8 89.486 < 2.2e-16 ***
zn 1 5549.7 37167 2178.1 75.258 < 2.2e-16 ***
indus 1 9995.2 32721 2113.6 153.955 < 2.2e-16 ***
chas 1 1312.1 41404 2232.7 15.972 7.391e-05 ***
nox 1 7800.1 34916 2146.5 112.591 < 2.2e-16 ***
rm 1 20654.4 22062 1914.2 471.847 < 2.2e-16 ***
age 1 6069.8 36647 2171.0 83.478 < 2.2e-16 ***
dis 1 2668.2 40048 2215.9 33.580 1.207e-08 ***
rad 1 6221.1 36495 2168.9 85.914 < 2.2e-16 ***
tax 1 9377.3 33339 2123.1 141.761 < 2.2e-16 ***
ptratio 1 11014.3 31702 2097.6 175.106 < 2.2e-16 ***
black 1 4749.9 37966 2188.9 63.054 1.318e-14 ***
lstat 1 23243.9 19472 1851.0 601.618 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + lstat)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 19472 1851.0
crim 1 146.9 19325 1849.2 3.8246 0.051059 .
zn 1 160.3 19312 1848.8 4.1758 0.041527 *
indus 1 98.7 19374 1850.4 2.5635 0.109981
chas 1 786.3 18686 1832.2 21.1665 5.336e-06 ***
nox 1 4.8 19468 1852.9 0.1239 0.724966
rm 1 4033.1 15439 1735.6 131.3942 < 2.2e-16 ***
age 1 304.3 19168 1845.0 7.9840 0.004907 **
dis 1 772.4 18700 1832.5 20.7764 6.488e-06 ***
rad 1 25.1 19447 1852.4 0.6491 0.420799
tax 1 274.4 19198 1845.8 7.1896 0.007574 **
ptratio 1 2670.1 16802 1778.4 79.9340 < 2.2e-16 ***
black 1 198.3 19274 1847.8 5.1764 0.023316 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + rm)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 15439 1735.6
crim 1 311.42 15128 1727.3 10.3341 0.0013900 **
zn 1 56.56 15383 1735.7 1.8457 0.1748999
indus 1 61.09 15378 1735.6 1.9942 0.1585263
chas 1 548.53 14891 1719.3 18.4921 2.051e-05 ***
nox 1 14.90 15424 1737.1 0.4849 0.4865454
age 1 20.18 15419 1736.9 0.6571 0.4179577
dis 1 351.15 15088 1725.9 11.6832 0.0006819 ***
rad 1 180.45 15259 1731.6 5.9367 0.0151752 *
tax 1 425.16 15014 1723.5 14.2154 0.0001824 ***
ptratio 1 1711.32 13728 1678.1 62.5791 1.645e-14 ***
black 1 512.31 14927 1720.5 17.2290 3.892e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + ptratio)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm + ptratio
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 13728 1678.1
crim 1 122.52 13606 1675.6 4.5115 0.0341560 *
zn 1 14.96 13713 1679.6 0.5467 0.4600162
indus 1 0.83 13727 1680.1 0.0301 0.8622688
chas 1 377.96 13350 1666.0 14.1841 0.0001854 ***
nox 1 24.81 13703 1679.2 0.9072 0.3413103
age 1 66.24 13662 1677.7 2.4291 0.1197340
dis 1 499.08 13229 1661.4 18.9009 1.668e-05 ***
rad 1 6.07 13722 1679.9 0.2218 0.6378931
tax 1 44.36 13684 1678.5 1.6242 0.2031029
black 1 389.68 13338 1665.6 14.6369 0.0001468 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + dis)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm + ptratio + dis
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 13229 1661.4
crim 1 233.54 12995 1654.4 8.9856 0.002857 **
zn 1 144.81 13084 1657.8 5.5337 0.019040 *
indus 1 242.65 12986 1654.0 9.3424 0.002359 **
chas 1 267.43 12962 1653.1 10.3162 0.001404 **
nox 1 759.56 12469 1633.5 30.4572 5.488e-08 ***
age 1 61.36 13168 1661.0 2.3300 0.127535
rad 1 22.40 13206 1662.5 0.8481 0.357540
tax 1 240.34 12989 1654.1 9.2519 0.002476 **
black 1 502.64 12726 1643.8 19.7482 1.089e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + nox)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm + ptratio + dis + nox
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 12469 1633.5
crim 1 141.43 12328 1629.7 5.7248 0.0170955 *
zn 1 151.71 12318 1629.3 6.1461 0.0134998 *
indus 1 17.10 12452 1634.8 0.6854 0.4081212
chas 1 328.27 12141 1622.0 13.4920 0.0002655 ***
age 1 0.25 12469 1635.5 0.0098 0.9211095
rad 1 53.48 12416 1633.3 2.1494 0.1432543
tax 1 10.50 12459 1635.0 0.4205 0.5169877
black 1 311.83 12158 1622.7 12.7991 0.0003806 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + chas)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm + ptratio + dis + nox + chas
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 12141 1622.0
crim 1 116.330 12025 1619.1 4.8178 0.0286287 *
zn 1 164.406 11977 1617.1 6.8361 0.0092038 **
indus 1 26.274 12115 1622.9 1.0800 0.2991938
age 1 2.331 12139 1623.9 0.0956 0.7572505
rad 1 58.556 12082 1621.5 2.4135 0.1209286
tax 1 4.187 12137 1623.8 0.1718 0.6786847
black 1 272.837 11868 1612.5 11.4484 0.0007719 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + black)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 11868 1612.5
crim 1 55.633 11813 1612.1 2.3407 0.126671
zn 1 189.936 11678 1606.3 8.0832 0.004652 **
indus 1 15.584 11853 1613.8 0.6535 0.419264
age 1 9.446 11859 1614.1 0.3959 0.529516
rad 1 144.320 11724 1608.3 6.1180 0.013714 *
tax 1 2.703 11866 1614.4 0.1132 0.736643
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + zn)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 11678 1606.3
crim 1 94.712 11584 1604.2 4.0555 0.04457 *
indus 1 16.048 11662 1607.6 0.6825 0.40912
age 1 1.491 11677 1608.2 0.0633 0.80138
rad 1 93.614 11585 1604.2 4.0081 0.04583 *
tax 1 3.952 11674 1608.1 0.1679 0.68215
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + crim)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
crim
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 11584 1604.2
indus 1 15.773 11568 1605.5 0.6750 0.411725
age 1 2.470 11581 1606.1 0.1056 0.745387
rad 1 228.604 11355 1596.1 9.9656 0.001692 **
tax 1 1.305 11582 1606.1 0.0558 0.813396
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + rad)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
crim + rad
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 11355 1596.1
indus 1 33.894 11321 1596.6 1.4790 0.2245162
age 1 0.096 11355 1598.1 0.0042 0.9485270
tax 1 273.619 11081 1585.8 12.1978 0.0005214 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + tax)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions
Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn +
crim + rad + tax
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 11081 1585.8
indus 1 2.51754 11079 1587.7 0.1120 0.7380
age 1 0.06271 11081 1587.8 0.0028 0.9579
summary(mod.fs)
Call:
lm(formula = medv ~ lstat + rm + ptratio + dis + nox + chas +
black + zn + crim + rad + tax, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-15.5984 -2.7386 -0.5046 1.7273 26.2373
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.341145 5.067492 7.171 2.73e-12 ***
lstat -0.522553 0.047424 -11.019 < 2e-16 ***
rm 3.801579 0.406316 9.356 < 2e-16 ***
ptratio -0.946525 0.129066 -7.334 9.24e-13 ***
dis -1.492711 0.185731 -8.037 6.84e-15 ***
nox -17.376023 3.535243 -4.915 1.21e-06 ***
chas 2.718716 0.854240 3.183 0.001551 **
black 0.009291 0.002674 3.475 0.000557 ***
zn 0.045845 0.013523 3.390 0.000754 ***
crim -0.108413 0.032779 -3.307 0.001010 **
rad 0.299608 0.063402 4.726 3.00e-06 ***
tax -0.011778 0.003372 -3.493 0.000521 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.736 on 494 degrees of freedom
Multiple R-squared: 0.7406, Adjusted R-squared: 0.7348
F-statistic: 128.2 on 11 and 494 DF, p-value: < 2.2e-16
Backward elimination starts with all of the variables in the model and removes the least significant variable one at a time.
mod.be <- lm(medv ~ . , data = Boston)
drop1(mod.be, test = "F") # single term deletions
Single term deletions
Model:
medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad +
tax + ptratio + black + lstat
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 11079 1589.6
crim 1 243.22 11322 1598.6 10.8012 0.0010868 **
zn 1 257.49 11336 1599.3 11.4351 0.0007781 ***
indus 1 2.52 11081 1587.8 0.1118 0.7382881
chas 1 218.97 11298 1597.5 9.7243 0.0019250 **
nox 1 487.16 11566 1609.4 21.6342 4.246e-06 ***
rm 1 1871.32 12950 1666.6 83.1040 < 2.2e-16 ***
age 1 0.06 11079 1587.7 0.0027 0.9582293
dis 1 1232.41 12311 1641.0 54.7305 6.013e-13 ***
rad 1 479.15 11558 1609.1 21.2788 5.071e-06 ***
tax 1 242.26 11321 1598.6 10.7585 0.0011116 **
ptratio 1 1194.23 12273 1639.4 53.0350 1.309e-12 ***
black 1 270.63 11349 1599.8 12.0187 0.0005729 ***
lstat 1 2410.84 13490 1687.3 107.0634 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.be <- update(mod.be, . ~ . - age)
drop1(mod.be, test = "F") # single term deletions
Single term deletions
Model:
medv ~ crim + zn + indus + chas + nox + rm + dis + rad + tax +
ptratio + black + lstat
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 11079 1587.7
crim 1 243.20 11322 1596.6 10.8221 0.0010747 **
zn 1 260.32 11339 1597.4 11.5841 0.0007194 ***
indus 1 2.52 11081 1585.8 0.1120 0.7379887
chas 1 219.91 11299 1595.6 9.7859 0.0018626 **
nox 1 520.87 11600 1608.9 23.1781 1.967e-06 ***
rm 1 1959.55 13038 1668.0 87.1987 < 2.2e-16 ***
dis 1 1352.26 12431 1643.9 60.1743 5.028e-14 ***
rad 1 481.09 11560 1607.2 21.4083 4.751e-06 ***
tax 1 242.24 11321 1596.6 10.7796 0.0010991 **
ptratio 1 1200.23 12279 1637.7 53.4092 1.099e-12 ***
black 1 272.26 11351 1597.9 12.1154 0.0005445 ***
lstat 1 2718.88 13798 1696.7 120.9879 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.be <- update(mod.be, . ~ . - indus)
drop1(mod.be, test = "F") # single term deletions
Single term deletions
Model:
medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio +
black + lstat
Df Sum of Sq RSS AIC F value Pr(>F)
<none> 11081 1585.8
crim 1 245.37 11327 1594.8 10.939 0.0010104 **
zn 1 257.82 11339 1595.4 11.494 0.0007543 ***
chas 1 227.21 11309 1594.0 10.129 0.0015515 **
nox 1 541.91 11623 1607.9 24.158 1.209e-06 ***
rm 1 1963.66 13045 1666.3 87.539 < 2.2e-16 ***
dis 1 1448.94 12530 1645.9 64.593 6.837e-15 ***
rad 1 500.92 11582 1606.1 22.331 2.997e-06 ***
tax 1 273.62 11355 1596.1 12.198 0.0005214 ***
ptratio 1 1206.45 12288 1636.0 53.783 9.235e-13 ***
black 1 270.82 11352 1596.0 12.073 0.0005566 ***
lstat 1 2723.48 13805 1695.0 121.411 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.be <- update(mod.be, . ~ . - age)
summary(mod.be)
Call:
lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
tax + ptratio + black + lstat, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-15.5984 -2.7386 -0.5046 1.7273 26.2373
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.341145 5.067492 7.171 2.73e-12 ***
crim -0.108413 0.032779 -3.307 0.001010 **
zn 0.045845 0.013523 3.390 0.000754 ***
chas 2.718716 0.854240 3.183 0.001551 **
nox -17.376023 3.535243 -4.915 1.21e-06 ***
rm 3.801579 0.406316 9.356 < 2e-16 ***
dis -1.492711 0.185731 -8.037 6.84e-15 ***
rad 0.299608 0.063402 4.726 3.00e-06 ***
tax -0.011778 0.003372 -3.493 0.000521 ***
ptratio -0.946525 0.129066 -7.334 9.24e-13 ***
black 0.009291 0.002674 3.475 0.000557 ***
lstat -0.522553 0.047424 -11.019 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.736 on 494 degrees of freedom
Multiple R-squared: 0.7406, Adjusted R-squared: 0.7348
F-statistic: 128.2 on 11 and 494 DF, p-value: < 2.2e-16
## Criterion Based Selection
library(leaps)
models <- regsubsets(medv ~ ., data = Boston, nvmax = 11)
summary(models)
Subset selection object
Call: regsubsets.formula(medv ~ ., data = Boston, nvmax = 11)
13 Variables (and intercept)
Forced in Forced out
crim FALSE FALSE
zn FALSE FALSE
indus FALSE FALSE
chas FALSE FALSE
nox FALSE FALSE
rm FALSE FALSE
age FALSE FALSE
dis FALSE FALSE
rad FALSE FALSE
tax FALSE FALSE
ptratio FALSE FALSE
black FALSE FALSE
lstat FALSE FALSE
1 subsets of each size up to 11
Selection Algorithm: exhaustive
crim zn indus chas nox rm age dis rad tax ptratio black lstat
1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " " " "*"
2 ( 1 ) " " " " " " " " " " "*" " " " " " " " " " " " " "*"
3 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" " " "*"
4 ( 1 ) " " " " " " " " " " "*" " " "*" " " " " "*" " " "*"
5 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " " "*" " " "*"
6 ( 1 ) " " " " " " "*" "*" "*" " " "*" " " " " "*" " " "*"
7 ( 1 ) " " " " " " "*" "*" "*" " " "*" " " " " "*" "*" "*"
8 ( 1 ) " " "*" " " "*" "*" "*" " " "*" " " " " "*" "*" "*"
9 ( 1 ) "*" " " " " "*" "*" "*" " " "*" "*" " " "*" "*" "*"
10 ( 1 ) "*" "*" " " " " "*" "*" " " "*" "*" "*" "*" "*" "*"
11 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
R2adj <- summary(models)$adjr2
R2adj
[1] 0.5432418 0.6371245 0.6767036 0.6878351 0.7051702 0.7123567 0.7182560
[8] 0.7222072 0.7252743 0.7299149 0.7348058
which.max(R2adj)
[1] 11
MCP <- summary(models)$cp # Mallow's C_p
MCP
[1] 362.75295 185.64743 111.64889 91.48526 59.75364 47.17537 37.05889
[8] 30.62398 25.86592 18.20493 10.11455
BIC <- summary(models)$bic
BIC
[1] -385.0521 -496.2582 -549.4767 -561.9884 -585.6823 -592.9553 -598.2295
[8] -600.1663 -600.5767 -603.9917 -608.0353
which.min(BIC)
[1] 11
which.min(MCP)
[1] 11
Comment on model selected with \(R^2_{adj}\), Mallow’s \(C_p\), and the Bayesian Information Criterion (BIC).
plot(models, scale = "adjr2")
plot(models, scale = "Cp")
plot(models, scale = "bic")
par(mfrow=c(2, 2))
plot(mod.be)
par(mfrow=c(1, 1))
library(car)
influenceIndexPlot(mod.be, id.n = 3)
outlierTest(mod.be)
rstudent unadjusted p-value Bonferonni p
369 5.893600 7.0113e-09 3.5477e-06
372 5.500418 6.0950e-08 3.0841e-05
373 5.325354 1.5341e-07 7.7626e-05
influencePlot(mod.be)
StudRes Hat CookD
369 5.8935999 0.05615812 0.4015156
381 -0.9956297 0.30484091 0.1903293
InflMea <- influence.measures(mod.be)
summary(InflMea) # Which observations are influential
Potentially influential observations of
lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat, data = Boston) :
dfb.1_ dfb.crim dfb.zn dfb.chas dfb.nox dfb.rm dfb.dis dfb.rad
142 -0.10 -0.07 0.12 0.00 -0.06 0.06 -0.12 -0.16
143 0.01 0.00 0.00 -0.04 -0.03 0.00 -0.01 0.01
146 -0.02 -0.01 0.00 -0.01 0.05 0.02 0.02 -0.02
147 0.00 0.00 0.00 0.00 -0.01 0.00 0.00 0.00
152 0.00 0.00 0.00 -0.01 0.05 -0.02 0.01 -0.01
153 -0.04 -0.02 -0.02 -0.16 -0.16 0.13 -0.04 0.04
156 0.01 -0.01 0.00 -0.16 -0.14 0.02 -0.06 0.08
157 -0.01 0.00 0.00 0.00 -0.02 0.01 0.00 0.01
160 0.02 -0.01 0.00 0.02 -0.09 0.00 -0.03 0.02
162 0.04 0.05 -0.13 -0.09 0.01 0.10 -0.06 -0.16
163 -0.03 0.07 -0.11 0.31 -0.02 0.13 0.00 -0.16
167 -0.06 0.05 -0.15 -0.09 0.01 0.24 -0.02 -0.18
187 -0.11 0.03 -0.08 -0.08 0.00 0.25 -0.07 0.06
215 0.08 -0.10 -0.02 -0.01 -0.19 0.04 -0.06 0.06
229 -0.03 -0.01 -0.10 -0.06 -0.03 0.16 -0.02 0.06
234 -0.13 0.00 -0.13 -0.07 -0.01 0.28 0.03 0.04
254 -0.43 0.06 -0.25 -0.02 0.17 0.44 0.46 -0.07
358 0.02 0.01 0.00 -0.04 -0.02 0.00 -0.01 -0.01
359 -0.01 0.00 0.00 0.02 0.01 0.00 0.01 0.00
365 0.58 0.08 0.07 -0.53 -0.21 -0.59 -0.17 0.04
366 0.51 -0.12 0.16 -0.05 0.11 -0.84 -0.18 0.25
368 0.56 -0.01 0.11 0.00 -0.14 -0.64 -0.23 0.15
369 0.76 -0.19 0.19 -0.11 -0.11 -1.06_* -0.49 0.39
370 0.06 -0.05 0.04 0.62 -0.09 -0.12 -0.16 0.09
371 -0.02 -0.02 0.03 0.54 -0.07 -0.02 -0.14 0.06
372 0.18 -0.06 0.09 -0.10 -0.14 -0.20 -0.32 0.21
373 0.23 -0.03 0.14 0.88 -0.08 -0.40 -0.24 0.15
375 0.16 0.02 0.11 0.00 -0.18 -0.18 -0.13 0.16
381 0.12 -0.64 0.05 -0.01 -0.05 -0.10 -0.06 0.16
399 0.01 -0.06 0.00 0.00 0.00 -0.01 0.00 0.00
402 0.10 -0.03 -0.02 0.04 -0.02 -0.09 0.01 -0.06
405 0.00 0.05 0.00 0.00 0.00 0.00 0.00 0.00
406 0.03 -0.30 0.01 -0.01 -0.02 0.00 -0.02 0.06
411 0.00 -0.01 0.00 0.00 0.00 0.00 0.00 0.00
413 0.43 -0.03 0.08 0.04 -0.41 -0.21 -0.21 0.12
415 0.14 0.47 0.03 0.04 -0.08 -0.11 0.01 -0.05
419 0.01 0.28 -0.02 0.01 0.01 0.00 0.02 -0.07
428 -0.02 -0.11 0.01 0.00 -0.01 0.01 -0.01 0.03
489 -0.02 0.00 -0.01 0.01 -0.04 0.00 -0.02 -0.17
490 0.01 0.01 0.00 0.00 0.02 -0.01 0.01 0.06
491 -0.02 -0.03 -0.01 0.02 -0.08 0.02 -0.03 -0.23
492 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
493 -0.04 0.00 -0.02 0.01 -0.03 0.02 -0.01 -0.23
506 0.05 -0.03 -0.08 0.03 -0.10 0.06 0.10 0.08
dfb.tax dfb.ptrt dfb.blck dfb.lstt dffit cov.r cook.d hat
142 0.07 0.12 0.09 0.32 0.48_* 0.95 0.02 0.04
143 0.00 0.01 -0.01 -0.02 -0.07 1.09_* 0.00 0.07
146 0.00 -0.02 -0.03 0.03 0.09 1.08_* 0.00 0.05
147 0.00 0.00 0.00 0.00 -0.01 1.08_* 0.00 0.05
152 0.00 -0.01 0.00 -0.02 0.06 1.07_* 0.00 0.05
153 0.00 0.04 0.02 0.11 -0.30 1.08_* 0.01 0.08_*
156 -0.01 0.02 0.15 0.05 -0.30 1.09_* 0.01 0.08_*
157 0.00 0.00 0.02 0.01 -0.03 1.10_* 0.00 0.07
160 0.00 0.01 -0.01 0.04 -0.11 1.07_* 0.00 0.05
162 0.19 -0.19 0.01 -0.17 0.46 0.86_* 0.02 0.02
163 0.18 -0.10 0.01 -0.07 0.47_* 0.97 0.02 0.05
167 0.20 -0.17 0.02 -0.06 0.47_* 0.87_* 0.02 0.03
187 -0.15 0.06 0.01 0.02 0.42 0.84_* 0.01 0.02
215 -0.10 -0.07 0.00 0.40 0.49_* 0.89_* 0.02 0.03
229 -0.05 -0.03 0.00 -0.02 0.29 0.91_* 0.01 0.01
234 -0.04 -0.01 0.02 0.04 0.37 0.91_* 0.01 0.02
254 0.08 0.11 0.05 0.11 0.63_* 0.89_* 0.03 0.05
358 0.00 -0.01 -0.01 0.01 -0.05 1.07_* 0.00 0.05
359 0.00 0.00 0.00 -0.01 0.02 1.08_* 0.00 0.05
365 -0.14 -0.26 -0.09 -0.01 -0.98_* 0.83_* 0.08 0.07_*
366 -0.10 -0.04 -0.01 -0.68 0.95_* 0.92_* 0.07 0.09_*
368 -0.05 -0.08 -0.32 -0.44 0.75_* 0.92_* 0.05 0.07_*
369 0.00 -0.07 0.06 -1.10_* 1.44_* 0.48_* 0.16 0.06
370 0.13 0.10 0.06 -0.37 0.87_* 0.76_* 0.06 0.05
371 0.13 0.11 0.09 -0.30 0.78_* 0.82_* 0.05 0.05
372 0.06 0.01 0.15 -0.38 0.73_* 0.51_* 0.04 0.02
373 0.08 0.11 0.02 -0.49 1.17_* 0.55_* 0.11 0.05
375 -0.07 -0.12 0.23 0.32 0.62_* 0.89_* 0.03 0.05
381 -0.03 -0.03 -0.15 0.06 -0.66_* 1.44_* 0.04 0.30_*
399 0.00 0.00 -0.03 -0.02 -0.08 1.07_* 0.00 0.05
402 0.00 -0.02 -0.16 -0.08 -0.27 0.92_* 0.01 0.01
405 0.00 0.00 0.01 0.01 0.05 1.08_* 0.00 0.05
406 0.00 -0.01 -0.08 0.02 -0.32 1.20_* 0.01 0.16_*
411 0.00 0.00 0.01 0.01 -0.01 1.16_* 0.00 0.12_*
413 -0.05 -0.15 -0.49 0.28 0.84_* 0.80_* 0.06 0.05
415 -0.06 -0.05 -0.18 0.15 0.69_* 0.95 0.04 0.07_*
419 0.00 0.01 -0.07 -0.05 0.30 1.25_* 0.01 0.19_*
428 0.00 -0.01 0.10 0.06 -0.17 1.08_* 0.00 0.06
489 0.19 0.01 0.02 0.00 0.21 1.09_* 0.00 0.07_*
490 -0.06 0.00 0.00 -0.01 -0.07 1.11_* 0.00 0.07_*
491 0.24 0.00 -0.01 0.09 0.29 1.09_* 0.01 0.08_*
492 0.00 0.00 0.00 0.00 -0.01 1.11_* 0.00 0.07_*
493 0.24 0.02 0.02 -0.03 0.26 1.08_* 0.01 0.07_*
506 0.05 -0.22 0.01 0.14 -0.32 0.93_* 0.01 0.02
boxCox(mod.be)
boxCox(mod.be, lambda = seq(-0.1, 0.3, 0.01))
mod.log <- lm(log(medv) ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat, data = Boston)
boxCox(mod.log)
par(mfrow=c(2, 2))
plot(mod.log)
par(mfrow=c(1, 1))
plot(mod.log, which = 1)
library(car)
residualPlots(mod.log)
Test stat Pr(>|t|)
crim 2.219 0.027
zn 1.530 0.127
chas 0.973 0.331
nox -0.632 0.527
rm 8.289 0.000
dis 4.071 0.000
rad -0.777 0.437
tax -0.125 0.901
ptratio 1.855 0.064
black -2.732 0.007
lstat 5.857 0.000
Tukey test 5.327 0.000
mod2 <- lm(medv ~ crim + zn + chas + nox + rm + I(rm^2) + dis + rad + tax + ptratio + black + lstat + I(lstat^2), data = Boston)
summary(mod.log)
Call:
lm(formula = log(medv) ~ crim + zn + chas + nox + rm + dis +
rad + tax + ptratio + black + lstat, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-0.73400 -0.09460 -0.01771 0.09782 0.86290
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.0836823 0.2030491 20.112 < 2e-16 ***
crim -0.0103187 0.0013134 -7.856 2.49e-14 ***
zn 0.0010874 0.0005418 2.007 0.045308 *
chas 0.1051484 0.0342285 3.072 0.002244 **
nox -0.7217440 0.1416535 -5.095 4.97e-07 ***
rm 0.0906728 0.0162807 5.569 4.20e-08 ***
dis -0.0517059 0.0074420 -6.948 1.18e-11 ***
rad 0.0134457 0.0025405 5.293 1.82e-07 ***
tax -0.0005579 0.0001351 -4.129 4.28e-05 ***
ptratio -0.0374259 0.0051715 -7.237 1.77e-12 ***
black 0.0004127 0.0001071 3.852 0.000133 ***
lstat -0.0286039 0.0019002 -15.053 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1898 on 494 degrees of freedom
Multiple R-squared: 0.7891, Adjusted R-squared: 0.7844
F-statistic: 168.1 on 11 and 494 DF, p-value: < 2.2e-16
summary(mod2)
Call:
lm(formula = medv ~ crim + zn + chas + nox + rm + I(rm^2) + dis +
rad + tax + ptratio + black + lstat + I(lstat^2), data = Boston)
Residuals:
Min 1Q Median 3Q Max
-26.4974 -2.2978 -0.1797 1.8022 26.4403
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 116.316659 9.262352 12.558 < 2e-16 ***
crim -0.151178 0.027816 -5.435 8.64e-08 ***
zn 0.020712 0.011587 1.788 0.074468 .
chas 2.425973 0.718922 3.374 0.000798 ***
nox -14.539960 2.995854 -4.853 1.63e-06 ***
rm -21.743779 2.784329 -7.809 3.49e-14 ***
I(rm^2) 1.963233 0.217161 9.040 < 2e-16 ***
dis -1.164971 0.158153 -7.366 7.46e-13 ***
rad 0.236699 0.053534 4.421 1.21e-05 ***
tax -0.008668 0.002846 -3.046 0.002443 **
ptratio -0.690526 0.110027 -6.276 7.63e-10 ***
black 0.007000 0.002255 3.104 0.002022 **
lstat -1.248991 0.119729 -10.432 < 2e-16 ***
I(lstat^2) 0.020466 0.003275 6.250 8.93e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.984 on 492 degrees of freedom
Multiple R-squared: 0.8172, Adjusted R-squared: 0.8123
F-statistic: 169.2 on 13 and 492 DF, p-value: < 2.2e-16
boxCox(mod2)
mod3 <- lm(medv ~ crim + zn + chas + nox + rm + I(rm^2) + dis + rad + tax + ptratio + black + lstat, data = Boston)
summary(mod3)
Call:
lm(formula = medv ~ crim + zn + chas + nox + rm + I(rm^2) + dis +
rad + tax + ptratio + black + lstat, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-28.1291 -2.2462 -0.2559 1.6153 26.9025
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 135.190606 9.087840 14.876 < 2e-16 ***
crim -0.131174 0.028677 -4.574 6.06e-06 ***
zn 0.033005 0.011851 2.785 0.00556 **
chas 2.443193 0.746149 3.274 0.00113 **
nox -16.784004 3.086920 -5.437 8.53e-08 ***
rm -28.796902 2.641760 -10.901 < 2e-16 ***
I(rm^2) 2.540282 0.203998 12.452 < 2e-16 ***
dis -1.176651 0.164132 -7.169 2.78e-12 ***
rad 0.240531 0.055558 4.329 1.81e-05 ***
tax -0.009542 0.002950 -3.235 0.00130 **
ptratio -0.740025 0.113898 -6.497 2.00e-10 ***
black 0.007221 0.002340 3.085 0.00215 **
lstat -0.543575 0.041440 -13.117 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.135 on 493 degrees of freedom
Multiple R-squared: 0.8027, Adjusted R-squared: 0.7979
F-statistic: 167.1 on 12 and 493 DF, p-value: < 2.2e-16
boxCox(mod3)